1) Cleaning & introduction to the dataset
2) Proportion of the different Airbnb flats in the city
3) Visualisation of the different areas of Singapore by price
4) Display flats by category (Entire home, Shared room, Private room)
5) Word count to see which words appear most often
6) What about the availability?
7) Who are the top hosts?
8) What about the reviews?
9) Sorter
10) Conclusion
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keplergl import KeplerGl
import plotly.express as px
import ipywidgets as widgets
# Load the Singapore Airbnb listings export.
df = pd.read_csv('singapore.csv')
df.head()
df.shape
# Drop the columns that are not used anywhere in this analysis.
df = df.drop(columns=["last_review", "calculated_host_listings_count"])
# Replace missing reviews-per-month values with 0. A numeric 0 (not the
# string "0") keeps the column numeric, and assigning the result back avoids
# an in-place fill on a column selection, which may not reach `df`.
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)
# Remove listings whose price is below 1 (this includes a price of 0) or
# above 9000. Rows with a missing price are left untouched, as before.
out_of_range = (df['price'] < 1) | (df['price'] > 9000)
df.drop(index=df.index[out_of_range], inplace=True)
df.shape
# Interactive Kepler.gl map fed with the cleaned listings.
map_1 = KeplerGl(height=500)
map_1.add_data(name="data_2", data=df)
# Bare expression: displays the map when it is the last statement of a cell.
map_1
We can see that the flats are heavily concentrated in the south of the country. This suggests that the south is the centre of the country's activity (touristic and economic).
# Scatter every listing by coordinates, coloured by neighbourhood group.
plt.figure(figsize=(12, 12))
a = sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='neighbourhood_group',
    palette='rocket_r',
)
# The x axis carries the longitude and the y axis the latitude; the labels
# were previously swapped.
plt.xlabel('Longitude')
plt.ylabel("Latitude")
plt.legend(frameon=False, fontsize=13)
# Number of listings per neighbourhood group, as a two-column table:
# 'Listings' (the count) and 'Neighbourhood Group' (the group label).
group_counts = df['neighbourhood_group'].value_counts()
neigh = pd.DataFrame({
    'Listings': group_counts.values,
    'Neighbourhood Group': group_counts.index,
})
neigh
# Pie chart of the share of listings held by each neighbourhood group.
fig = px.pie(
    neigh,
    names='Neighbourhood Group',
    values='Listings',
    title="Percentage listings Airbnb by Neighbourhood Group",
)
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_mode='hide', uniformtext_minsize=10)
fig.show()
# Distribution of prices, restricted to listings priced under 600.
prix = df.loc[df['price'] < 600]
plt.figure(figsize=(10, 6))
sns.distplot(prix["price"])
plt.title("Overall distribution of price")
plt.show()
# Bar chart of the price per neighbourhood group (barplot aggregates the
# prices of each group into one bar).
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='neighbourhood_group', y='price', palette="rocket_r")
plt.title("Average price by night by neighbourhood of Singapore\n", fontsize=14)
plt.xlabel('neighbourhood_group', size="12")
plt.ylabel('Average price by night\n', size="12")
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
There are not many differences between the neighbourhood groups (this may be explained by outliers, which can be found in every neighbourhood group).
# Larger Kepler.gl map of the listings.
map_1 = KeplerGl(height=650)
map_1.add_data(name="data_1", data=df)
# Bare expression: displays the map when it is the last statement of a cell.
map_1
# Number of listings per room type, split by neighbourhood group.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=df,
    x='room_type',
    hue="neighbourhood_group",
    palette="rocket_r",
)
plt.title("Room Type most represent in Singapore by categories \n", size="24")
# Names of the ten neighbourhoods with the most listings. value_counts()
# sorts by descending count, so the first ten index labels are the names.
# Reading them from the index avoids depending on the column name that
# reset_index() generates, which is not 'index' on every pandas version.
top_nei = df['neighbourhood'].value_counts().head(10).index.tolist()
# catplot is a figure-level function that builds its own figure, so no
# plt.figure() call is made here (it would only leave an empty figure).
vis_3 = sns.catplot(
    x='neighbourhood',
    hue='neighbourhood_group',
    col='room_type',
    data=df.loc[df['neighbourhood'].isin(top_nei)],
    kind='count',
    palette='GnBu_d',
)
vis_3.set_xticklabels(rotation=90)
# Kepler.gl map of the listings for the room-type section.
map_2 = KeplerGl(height=600)
map_2.add_data(name="data_2", data=df)
# Bare expression: displays the map when it is the last statement of a cell.
map_2
# Bar chart of the price per room type (barplot aggregates the prices of
# each room type into one bar).
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='room_type', y='price', palette="rocket_r")
plt.title("Average price by night by room type of Singapore\n", fontsize=14)
plt.xlabel('Room_type', size="12")
plt.ylabel('Average price by night\n', size="12")
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
# Join every listing name into a single text for the word cloud.
text = " ".join(str(each) for each in df['name'])
# Start from the library's default stop words and add words that appear in
# most titles. set.update() expects iterables: the words must be passed as
# one list, otherwise each string is split into its individual characters.
stopwords = set(STOPWORDS)
stopwords.update(['city', 'Singapore', 'Apartment', 'room', 'Room', 'singapore', 'Bedroom'])
# Pass the stop words explicitly so the additions above are actually used.
wordcloud = WordCloud(
    max_words=200,
    background_color="white",
    stopwords=stopwords,
).generate(text)
# Single figure for the cloud (an extra empty figure was created before).
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
We can see many words like Modern, Cosy or Master Room used to describe the rooms, but also some tourist spots like Farrer Park, City Center or Tiong Bahru, which point to the most touristic places in the country.
# Listings plotted by coordinates, coloured by their availability_365 value.
plt.figure(figsize=(10, 6))
plt.scatter(
    x=df['longitude'],
    y=df['latitude'],
    c=df['availability_365'],
    cmap='spring',
    edgecolor='black',
    linewidth=1,
    alpha=1,
)
# Colour scale legend for the availability values.
cbar = plt.colorbar()
cbar.set_label('availability_365')
It seems that there is no specific area where Airbnb rooms are available for long periods, because all areas of Singapore are active (big universities, large companies): availability is homogeneous.
# 3D scatter: price (x) against neighbourhood group (y) and yearly
# availability (z), coloured by price; hovering shows the listing details.
fig = px.scatter_3d(
    df,
    x='price',
    y='neighbourhood_group',
    z='availability_365',
    color='price',
    opacity=0.9,
    hover_name='name',
    hover_data=['price', 'minimum_nights', 'id'],
    title='3d visualisation',
    template='plotly_dark',
)
fig.show()
import plotly.graph_objects as go
# Parallel-coordinates plot over price, number of reviews and availability.
# Line colouring follows the price on a 0-2000 'Electric' colour scale.
parcoords_line = {
    "color": df['price'],
    "colorscale": 'Electric',
    "showscale": True,
    "cmin": 0,
    "cmax": 2000,
}
# One entry per axis: displayed range, label and the column supplying values.
parcoords_axes = [
    {
        "range": [0, 2000],
        "constraintrange": [0, 2000],
        "label": "Price",
        "values": df['price'],
    },
    {
        "range": [0, 400],
        "label": "Number of Review",
        "values": df['number_of_reviews'],
    },
    {
        "range": [1, 1000],
        "label": 'Availability_365',
        "values": df['availability_365'],
    },
]
fig = go.Figure(data=go.Parcoords(line=parcoords_line, dimensions=parcoords_axes))
fig.show()
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 8))
# Restrict to numeric columns first: calling corr() on a frame that still
# holds text columns raises on recent pandas versions.
sns.heatmap(df.select_dtypes(include='number').corr())
There is no major correlation; we can conclude that the availability, the neighbourhood and the minimum number of nights do not play a role in the price.
# Ten listings with the highest number of reviews, with their host and group.
nbre_avismax = df[['number_of_reviews', 'host_id', 'neighbourhood_group']]
toz = nbre_avismax.sort_values(by=['number_of_reviews'], ascending=False)
# The former `toz.set_index('host_id')` call discarded its result and had no
# effect; it is removed because 'host_id' must remain a column for the plot.
toz = toz.head(10)
plt.subplots(figsize=(10, 5))
sns.barplot(x=toz['host_id'], y=toz['number_of_reviews'], palette="rocket_r")
# Mean number of reviews per neighbourhood group, highest first, as red bars.
RVQ = df.groupby('neighbourhood_group')[['number_of_reviews']].mean()
rrr = RVQ.sort_values(by='number_of_reviews', ascending=False)
rrr.plot(kind='bar', color='red')
# Kepler.gl map of the listings for the review section.
map_1 = KeplerGl(height=650)
map_1.add_data(name="data_1", data=df)
# Bare expression: displays the map when it is the last statement of a cell.
map_1
Indeed, we can see that no area is more dynamic than another. The number of reviews reflects the traffic of a flat: if a flat has a lot of reviews, the area is dynamic. This can be explained by two facts: Singapore is a small country in terms of surface area, and it is very well served by public transportation.
# Number of listings per host, as a two-column table: 'Listings' (the count)
# and 'host_id' (the host the count belongs to).
tophost = pd.DataFrame(df.host_id.value_counts())
tophost.columns = ['Listings']
tophost['host_id'] = tophost.index
tophost.reset_index(drop=True, inplace=True)
tophost.head()
# Pie chart of the share of listings held by each host. The title now names
# hosts; it previously repeated the neighbourhood-group chart's title.
fig = px.pie(
    tophost,
    values='Listings',
    names='host_id',
    title="Percentage listings Airbnb by Host",
)
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
fig.show()
It seems that some people or agencies manage many different flats in Singapore, so it is a real business.
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
@interact
def how_articles_more_than(column=['price'],
                           x=(10, 150)):
    """Return the rows of ``df`` whose ``column`` value is strictly below ``x``.

    The defaults are passed to ``interact`` as written: a list of column
    names for ``column`` and a ``(10, 150)`` tuple for ``x``.
    """
    below_threshold = df[column] < x
    return df.loc[below_threshold]
Singapore is very popular despite its small surface area. Furthermore, it is very touristic, which explains the popularity of the Airbnb platform in the country. For some people it is a real business, which is illegal. The country is homogeneous: no area is much cheaper than another.